CMS Nursing Home Provider Information - https://data.cms.gov/provider-data/dataset/4pq5-n9py (2017 - 2021)
CMS Medicare Claims Quality Measures - https://data.cms.gov/provider-data/dataset/ijh5-nb2v (April 2020 - March 2021)
CMS Survey Summary - https://data.cms.gov/provider-data/dataset/tbry-pc2d (2017 - 2020)
CMS MDS Quality Measures - https://data.cms.gov/provider-data/dataset/djen-97ju (April 2020 - March 2021)
CMS COVID-19 Nursing Home Data - https://data.cms.gov/covid-19/covid-19-nursing-home-data (April 2020 - December 2020, January 2021 - March 2021)
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
# Set directory
os.chdir('C:/Users/erpin/Documents/CIND820')
p = 'NH_ProviderInfo_Jan2022.csv'
s = 'NH_SurveySummary_Nov2021.csv'
m = 'NH_QualityMsr_MDS_Nov2021.csv'
c = 'NH_QualityMsr_Claims_Nov2021.csv'
cov = 'faclevel_2020.csv'
# Select columns from provider dataset
dfp = pd.read_csv(p, dtype={'Federal Provider Number':'str'}, encoding='cp1252')
colsp= [0,1,4,9,10,11,30,48,50,58,59,60,87,88,89]
dfp = dfp[dfp.columns[colsp]]
dfp['Federal Provider Number'] = dfp['Federal Provider Number'].apply(lambda x: x.lstrip('0'))
# Select columns from MDS dataset
dfm = pd.read_csv(m, dtype={'Federal Provider Number':'str'}, encoding='cp1252')
LS = dfm[dfm['Resident type']== 'Long Stay']
colsm = [0,7,8,17]
LS = LS[LS.columns[colsm]]
LSpivot = LS.pivot(index=['Federal Provider Number', 'Resident type'], columns='Measure Description', values='Four Quarter Average Score')
LSpivot.reset_index(drop=False, inplace=True)
LSpivot['Federal Provider Number'] = LSpivot['Federal Provider Number'].apply(lambda x: x.lstrip('0'))
# Select columns from surveys dataset
dfs = pd.read_csv(s, dtype={'Federal Provider Number':'str'}, encoding='cp1252')
colss = [0,6,7,9,10]
dfs = dfs[dfs.columns[colss]]
dfs = dfs[dfs['Inspection Cycle']==3]
dfs.to_csv('inspection.csv')
dfs['Federal Provider Number'] = dfs['Federal Provider Number'].apply(lambda x: x.lstrip('0'))
#select columns from COVID 2020 dataset
dfcov = pd.read_csv(cov, dtype={'Federal Provider Number':'str'})
colcov = [1,13,19,22]
dfcov = dfcov[dfcov.columns[colcov]]
dfcov['Total Number of Occupied Beds'] = dfcov['Total Number of Occupied Beds'].replace(0, np.nan)
dfcov['Confirmed COVID-19 Cases Per Occupied Beds'] = dfcov['Residents Weekly Confirmed COVID-19']/dfcov['Total Number of Occupied Beds']
dfcov['COVID-19 Deaths Per Occupied Beds'] = dfcov['Residents Weekly COVID-19 Deaths']/dfcov['Total Number of Occupied Beds']
dfcovm = dfcov.groupby(['Federal Provider Number']).mean().reset_index()
dfcovm['Federal Provider Number'] = dfcovm['Federal Provider Number'].apply(lambda x: x.lstrip('0'))
#Select columns from claims dataset
dfc = pd.read_csv(c, dtype={'Federal Provider Number':'str'}, encoding='cp1252')
dfc = dfc[dfc['Measure Description']=='Number of outpatient emergency department visits per 1000 long-stay resident days']
colc = [0,7,9]
dfc = dfc[dfc.columns[colc]]
dfc['Federal Provider Number'] = dfc['Federal Provider Number'].apply(lambda x: x.lstrip('0'))
df1 = pd.merge(dfp, LSpivot, on='Federal Provider Number', how='left')
df2 = pd.merge(df1, dfs, on='Federal Provider Number', how='left')
df3 = pd.merge(df2, dfcovm, on='Federal Provider Number', how='left')
final_df = pd.merge(df3, dfc, on='Federal Provider Number', how='left')
C:\Users\erpin\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3146: DtypeWarning: Columns (84,85) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
#Checking columns
final_df.columns.tolist()
['Federal Provider Number', 'Provider Name', 'Provider State', 'Ownership Type', 'Number of Certified Beds', 'Average Number of Residents per Day', 'Long-Stay QM Rating', 'Total nursing staff turnover', 'Registered Nurse turnover', 'Adjusted Nurse Aide Staffing Hours per Resident per Day', 'Adjusted LPN Staffing Hours per Resident per Day', 'Adjusted RN Staffing Hours per Resident per Day', 'Number of Facility Reported Incidents', 'Number of Substantiated Complaints', 'Number of Citations from Infection Control Inspections', 'Resident type', 'Percentage of high risk long-stay residents with pressure ulcers', 'Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine', 'Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine', 'Percentage of long-stay residents experiencing one or more falls with major injury', 'Percentage of long-stay residents who have depressive symptoms', 'Percentage of long-stay residents who lose too much weight', 'Percentage of long-stay residents who received an antianxiety or hypnotic medication', 'Percentage of long-stay residents who received an antipsychotic medication', 'Percentage of long-stay residents who were physically restrained', 'Percentage of long-stay residents whose ability to move independently worsened', 'Percentage of long-stay residents whose need for help with daily activities has increased', 'Percentage of long-stay residents with a catheter inserted and left in their bladder', 'Percentage of long-stay residents with a urinary tract infection', 'Percentage of low risk long-stay residents who lose control of their bowels or bladder', 'Inspection Cycle', 'Health Survey Date', 'Total Number of Health Deficiencies', 'Total Number of Fire Safety Deficiencies', 'Residents Weekly Confirmed COVID-19', 'Residents Weekly COVID-19 Deaths', 'Total Number of Occupied Beds', 'Confirmed COVID-19 Cases Per Occupied Beds', 'COVID-19 Deaths Per Occupied Beds', 'Measure Description', 'Adjusted Score']
# Rename 'Adjusted Score' to 'Number of outpatient emergency department visits per 1000 long-stay resident days'
final_df.rename(columns={'Adjusted Score': 'Number of outpatient emergency department visits per 1000 long-stay resident days'}, inplace=True)
#Remove unnecessary columns - Resident type, Inspection Cycle, Measure Description, Residents Weekly Confirmed COVID-19, Residents Weekly COVID-19 Deaths, Total Number of Occupied Beds
final_df.drop(['Resident type', 'Inspection Cycle', 'Measure Description', 'Residents Weekly Confirmed COVID-19', 'Residents Weekly COVID-19 Deaths', 'Total Number of Occupied Beds', 'Health Survey Date', 'Provider State'], axis=1, inplace=True)
# Count number of rows - 15238
len(final_df.index)
15238
#Check for duplicate rows - No duplicates
final_df[final_df['Federal Provider Number'].duplicated() == True].sort_values('Federal Provider Number')
| Federal Provider Number | Provider Name | Ownership Type | Number of Certified Beds | Average Number of Residents per Day | Long-Stay QM Rating | Total nursing staff turnover | Registered Nurse turnover | Adjusted Nurse Aide Staffing Hours per Resident per Day | Adjusted LPN Staffing Hours per Resident per Day | ... | Percentage of long-stay residents whose ability to move independently worsened | Percentage of long-stay residents whose need for help with daily activities has increased | Percentage of long-stay residents with a catheter inserted and left in their bladder | Percentage of long-stay residents with a urinary tract infection | Percentage of low risk long-stay residents who lose control of their bowels or bladder | Total Number of Health Deficiencies | Total Number of Fire Safety Deficiencies | Confirmed COVID-19 Cases Per Occupied Beds | COVID-19 Deaths Per Occupied Beds | Number of outpatient emergency department visits per 1000 long-stay resident days |
|---|
0 rows × 33 columns
#Check for null values in dependent value - 2241 rows
final_df['Number of outpatient emergency department visits per 1000 long-stay resident days'].isna().sum()
#Remove 2241 rows with null dependent value
mod_df = final_df.dropna(how='any', subset=['Number of outpatient emergency department visits per 1000 long-stay resident days'])
#Nulls per column
colnulls = mod_df.isnull().sum(axis=0).to_frame(name='Column Nulls')
#Nulls per column
colnulls = mod_df.isnull().sum(axis=0).to_frame(name='Column Nulls')
# Count nulls per row
rownulls = mod_df.set_index('Provider Name').isnull().sum(axis=1)
rownulls.count()
rownulls[rownulls > 15].count()
rownulls.sort_values(ascending=False)
Provider Name
HEDDINGTON OAKS 20
GOLDEN CROSS HEALTH CARE 20
GREENE COUNTY NURSING HOME 20
THE VILLAGES OF GENERAL BAPTIST HEALTH CARE EAST 20
ROLLING MEADOWS 17
..
HARROGATE 0
LAKELAND HEALTH CARE CENTER 0
ASPEN HILLS HEALTHCARE CENTER 0
PROMEDICA SKILLED NURSING & REHAB - MOUNTAINSIDE 0
LIFE CARE CENTER OF SULLIVAN 0
Length: 12997, dtype: int64
# Coding the 'Ownership' Categorical Variable in prepartion for dummy variable creation
mod_df['Ownership Type'].value_counts()
categorizing_ownership = {'Ownership Type': {
'For profit - Corporation': 1,
'Non profit - Corporation': 0,
'For profit - Limited Liability company': 1,
'For profit - Individual': 1,
'For profit - Partnership': 1,
'Government - County': 2,
'Non profit - Church related': 0,
'Non profit - Other': 0,
'Government - Hospital district': 2,
'Government - State': 2,
'Government - City': 2,
'Government - City/county': 2,
'Government - Federal': 2}}
mod_df = mod_df.replace(categorizing_ownership)
mod_df.head()
| Federal Provider Number | Provider Name | Ownership Type | Number of Certified Beds | Average Number of Residents per Day | Long-Stay QM Rating | Total nursing staff turnover | Registered Nurse turnover | Adjusted Nurse Aide Staffing Hours per Resident per Day | Adjusted LPN Staffing Hours per Resident per Day | ... | Percentage of long-stay residents whose ability to move independently worsened | Percentage of long-stay residents whose need for help with daily activities has increased | Percentage of long-stay residents with a catheter inserted and left in their bladder | Percentage of long-stay residents with a urinary tract infection | Percentage of low risk long-stay residents who lose control of their bowels or bladder | Total Number of Health Deficiencies | Total Number of Fire Safety Deficiencies | Confirmed COVID-19 Cases Per Occupied Beds | COVID-19 Deaths Per Occupied Beds | Number of outpatient emergency department visits per 1000 long-stay resident days | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15009 | BURNS NURSING HOME, INC. | 1 | 57 | 53.2 | 5.0 | NaN | NaN | NaN | NaN | ... | 28.180706 | 15.060243 | 0.738709 | 0.555555 | 66.000001 | 0.0 | 2.0 | 0.032987 | 0.003443 | 0.272998 |
| 1 | 15010 | COOSA VALLEY HEALTHCARE CENTER | 1 | 85 | 79.8 | 2.0 | 52.4 | 21.4 | 2.49541 | 0.83163 | ... | 14.764966 | 8.837209 | 2.067165 | 4.016065 | 42.857145 | 7.0 | 3.0 | 0.008493 | 0.000963 | 0.481088 |
| 2 | 15012 | HIGHLANDS HEALTH AND REHAB | 2 | 50 | 39.9 | 3.0 | NaN | NaN | NaN | NaN | ... | 31.351583 | 12.500002 | 4.839041 | 11.016951 | 75.471696 | 5.0 | 0.0 | 0.009605 | 0.003024 | 0.988491 |
| 3 | 15014 | EASTVIEW REHABILITATION & HEALTHCARE CENTER | 1 | 92 | 69.4 | 2.0 | NaN | NaN | 2.02560 | 0.98558 | ... | 36.218930 | 21.198155 | 2.700401 | 7.251909 | 33.333335 | 6.0 | 5.0 | 0.014678 | 0.000000 | 0.866367 |
| 4 | 15015 | PLANTATION MANOR NURSING HOME | 1 | 103 | 72.4 | 3.0 | 51.3 | 60.0 | 2.36250 | 1.28006 | ... | 10.404354 | 10.599077 | 0.560731 | 3.767122 | 32.323234 | 7.0 | 4.0 | 0.004360 | 0.000000 | 0.640404 |
5 rows × 33 columns
#Create dummy variables
clean_df = pd.get_dummies(mod_df, columns=['Ownership Type', 'Long-Stay QM Rating'])
clean_df.head()
| Federal Provider Number | Provider Name | Number of Certified Beds | Average Number of Residents per Day | Total nursing staff turnover | Registered Nurse turnover | Adjusted Nurse Aide Staffing Hours per Resident per Day | Adjusted LPN Staffing Hours per Resident per Day | Adjusted RN Staffing Hours per Resident per Day | Number of Facility Reported Incidents | ... | COVID-19 Deaths Per Occupied Beds | Number of outpatient emergency department visits per 1000 long-stay resident days | Ownership Type_0 | Ownership Type_1 | Ownership Type_2 | Long-Stay QM Rating_1.0 | Long-Stay QM Rating_2.0 | Long-Stay QM Rating_3.0 | Long-Stay QM Rating_4.0 | Long-Stay QM Rating_5.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15009 | BURNS NURSING HOME, INC. | 57 | 53.2 | NaN | NaN | NaN | NaN | NaN | 0 | ... | 0.003443 | 0.272998 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 1 | 15010 | COOSA VALLEY HEALTHCARE CENTER | 85 | 79.8 | 52.4 | 21.4 | 2.49541 | 0.83163 | 1.17810 | 0 | ... | 0.000963 | 0.481088 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 15012 | HIGHLANDS HEALTH AND REHAB | 50 | 39.9 | NaN | NaN | NaN | NaN | NaN | 0 | ... | 0.003024 | 0.988491 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 3 | 15014 | EASTVIEW REHABILITATION & HEALTHCARE CENTER | 92 | 69.4 | NaN | NaN | 2.02560 | 0.98558 | 0.53602 | 0 | ... | 0.000000 | 0.866367 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 4 | 15015 | PLANTATION MANOR NURSING HOME | 103 | 72.4 | 51.3 | 60.0 | 2.36250 | 1.28006 | 0.50438 | 0 | ... | 0.000000 | 0.640404 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 39 columns
#Dataset shape
clean_df.shape
(12997, 39)
#Dataset info
clean_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 12997 entries, 0 to 15235 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Federal Provider Number 12997 non-null object 1 Provider Name 12997 non-null object 2 Number of Certified Beds 12997 non-null int64 3 Average Number of Residents per Day 12973 non-null float64 4 Total nursing staff turnover 11052 non-null float64 5 Registered Nurse turnover 10350 non-null float64 6 Adjusted Nurse Aide Staffing Hours per Resident per Day 12645 non-null float64 7 Adjusted LPN Staffing Hours per Resident per Day 12645 non-null float64 8 Adjusted RN Staffing Hours per Resident per Day 12645 non-null float64 9 Number of Facility Reported Incidents 12997 non-null int64 10 Number of Substantiated Complaints 12997 non-null int64 11 Number of Citations from Infection Control Inspections 12993 non-null float64 12 Percentage of high risk long-stay residents with pressure ulcers 12960 non-null float64 13 Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine 12991 non-null float64 14 Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine 12996 non-null float64 15 Percentage of long-stay residents experiencing one or more falls with major injury 12991 non-null float64 16 Percentage of long-stay residents who have depressive symptoms 12986 non-null float64 17 Percentage of long-stay residents who lose too much weight 12981 non-null float64 18 Percentage of long-stay residents who received an antianxiety or hypnotic medication 12990 non-null float64 19 Percentage of long-stay residents who received an antipsychotic medication 12973 non-null float64 20 Percentage of long-stay residents who were physically restrained 12989 non-null float64 21 Percentage of long-stay residents whose ability to move independently worsened 12918 non-null float64 22 Percentage of long-stay residents whose need for help with daily activities has increased 12979 non-null float64 23 Percentage of long-stay residents with a catheter inserted and left in their bladder 12986 non-null float64 24 Percentage of long-stay residents with a urinary tract infection 12987 non-null float64 25 Percentage of low risk long-stay residents who lose control of their bowels or bladder 12764 non-null float64 26 Total Number of Health Deficiencies 12900 non-null float64 27 Total Number of Fire Safety Deficiencies 12883 non-null float64 28 Confirmed COVID-19 Cases Per Occupied Beds 12900 non-null float64 29 COVID-19 Deaths Per Occupied Beds 12900 non-null float64 30 Number of outpatient emergency department visits per 1000 long-stay resident days 12997 non-null float64 31 Ownership Type_0 12997 non-null uint8 32 Ownership Type_1 12997 non-null uint8 33 Ownership Type_2 12997 non-null uint8 34 Long-Stay QM Rating_1.0 12997 non-null uint8 35 Long-Stay QM Rating_2.0 12997 non-null uint8 36 Long-Stay QM Rating_3.0 12997 non-null uint8 37 Long-Stay QM Rating_4.0 12997 non-null uint8 38 Long-Stay QM Rating_5.0 12997 non-null uint8 dtypes: float64(26), int64(3), object(2), uint8(8) memory usage: 3.3+ MB
Both Total nursing staff turnover and Registered Nurse turnover attributes have a large, but acceptable number of nulls from 1000-2000 rows.
#Describe dataset
clean_df.describe()
# .to_csv("DescribeDataset.csv")
| Number of Certified Beds | Average Number of Residents per Day | Total nursing staff turnover | Registered Nurse turnover | Adjusted Nurse Aide Staffing Hours per Resident per Day | Adjusted LPN Staffing Hours per Resident per Day | Adjusted RN Staffing Hours per Resident per Day | Number of Facility Reported Incidents | Number of Substantiated Complaints | Number of Citations from Infection Control Inspections | ... | COVID-19 Deaths Per Occupied Beds | Number of outpatient emergency department visits per 1000 long-stay resident days | Ownership Type_0 | Ownership Type_1 | Ownership Type_2 | Long-Stay QM Rating_1.0 | Long-Stay QM Rating_2.0 | Long-Stay QM Rating_3.0 | Long-Stay QM Rating_4.0 | Long-Stay QM Rating_5.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 12997.000000 | 12973.000000 | 11052.000000 | 10350.000000 | 12645.000000 | 12645.000000 | 12645.000000 | 12997.000000 | 12997.000000 | 12993.000000 | ... | 12900.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 |
| mean | 113.143649 | 81.455230 | 51.273127 | 49.536705 | 2.171593 | 0.870455 | 0.675686 | 1.539432 | 4.832808 | 1.357269 | ... | 0.003258 | 0.701247 | 0.221282 | 0.713549 | 0.065169 | 0.091790 | 0.149034 | 0.174733 | 0.222282 | 0.353081 |
| std | 59.809910 | 46.666475 | 15.068248 | 21.247156 | 0.584521 | 0.339521 | 0.403360 | 3.578529 | 9.157541 | 2.368291 | ... | 0.004708 | 0.480244 | 0.415126 | 0.452120 | 0.246833 | 0.288741 | 0.356136 | 0.379753 | 0.415796 | 0.477946 |
| min | 18.000000 | 1.000000 | 6.100000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 74.000000 | 51.000000 | 40.900000 | 33.300000 | 1.788230 | 0.648650 | 0.400500 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000313 | 0.364782 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 104.000000 | 73.700000 | 50.700000 | 50.000000 | 2.110130 | 0.853320 | 0.585050 | 0.000000 | 2.000000 | 1.000000 | ... | 0.002016 | 0.595768 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 132.000000 | 99.300000 | 60.900000 | 64.000000 | 2.495520 | 1.073060 | 0.855110 | 2.000000 | 5.000000 | 2.000000 | ... | 0.004805 | 0.922189 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| max | 874.000000 | 703.300000 | 100.000000 | 100.000000 | 6.903020 | 3.636150 | 5.580270 | 72.000000 | 162.000000 | 43.000000 | ... | 0.263780 | 5.477409 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 37 columns
# Visualize distributions
clean_df.hist(bins=30, figsize=(30,25), layout=(13,3));
On visual inspection, it can be noted that ~15 plots are approximately normally distributed, however a number of attributes are not normally distributed or have notably long right- or left-tails. This will be further addressed in assumption testing.
Since the data was manually reported by nursing homes regarding their performance, a cautious approach will be taken for outlier detection and removal.
#Visualizing outliers
outlier_df = clean_df.drop(['Federal Provider Number', 'Provider Name'], axis=1).copy()
for i in outlier_df:
plt.figure()
outlier_df.boxplot([i])
<ipython-input-23-e1c281880675>:4: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). plt.figure()
On visual analysis, there are a varying amount of outliers in each attribute, however the majority of attributes have outliers on one-tail, resulting in skew. To quantify this, the skewness will be calculated below:
#Evaluating skewness of attributes
outlier_df.skew()
Number of Certified Beds 2.814405 Average Number of Residents per Day 2.784565 Total nursing staff turnover 0.356678 Registered Nurse turnover 0.149541 Adjusted Nurse Aide Staffing Hours per Resident per Day 0.709330 Adjusted LPN Staffing Hours per Resident per Day 0.584469 Adjusted RN Staffing Hours per Resident per Day 1.922657 Number of Facility Reported Incidents 5.765617 Number of Substantiated Complaints 4.545777 Number of Citations from Infection Control Inspections 4.017724 Percentage of high risk long-stay residents with pressure ulcers 0.891290 Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine -3.197396 Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine -3.938107 Percentage of long-stay residents experiencing one or more falls with major injury 1.146353 Percentage of long-stay residents who have depressive symptoms 3.685780 Percentage of long-stay residents who lose too much weight 0.974590 Percentage of long-stay residents who received an antianxiety or hypnotic medication 0.825813 Percentage of long-stay residents who received an antipsychotic medication 2.149230 Percentage of long-stay residents who were physically restrained 36.761392 Percentage of long-stay residents whose ability to move independently worsened 0.462988 Percentage of long-stay residents whose need for help with daily activities has increased 0.481214 Percentage of long-stay residents with a catheter inserted and left in their bladder 2.690169 Percentage of long-stay residents with a urinary tract infection 2.122940 Percentage of low risk long-stay residents who lose control of their bowels or bladder 0.009498 Total Number of Health Deficiencies 1.495910 Total Number of Fire Safety Deficiencies 1.749042 Confirmed COVID-19 Cases Per Occupied Beds 8.727233 COVID-19 Deaths Per Occupied Beds 15.602656 Number of outpatient emergency department visits per 1000 long-stay resident days 1.561035 Ownership Type_0 1.343020 Ownership Type_1 -0.944803 Ownership Type_2 3.523822 Long-Stay QM Rating_1.0 2.827949 Long-Stay QM Rating_2.0 1.971267 Long-Stay QM Rating_3.0 1.713311 Long-Stay QM Rating_4.0 1.336044 Long-Stay QM Rating_5.0 0.614886 dtype: float64
There is a significant amount of skew in 9 numeric attributes. To address this, right-skewed attributes values with a skew > 3 greater will have values greater than the 90th percentile replaced by the median. Left-skewed values with a skew < -3 will have values less than the 10th percentile replaced by the median. Categorical dummy variables will not be included.
#Addressing skew and testing resulting skewness
Q10 = outlier_df.quantile(0.1)
Q50 = outlier_df.quantile(0.5)
Q90 = outlier_df.quantile(0.9)
outlier_df['Number of Facility Reported Incidents'] = np.where(outlier_df['Number of Facility Reported Incidents'] > Q90['Number of Facility Reported Incidents'], Q50['Number of Facility Reported Incidents'], outlier_df['Number of Facility Reported Incidents'])
outlier_df['Number of Substantiated Complaints'] = np.where(outlier_df['Number of Substantiated Complaints'] > Q90['Number of Substantiated Complaints'], Q50['Number of Substantiated Complaints'], outlier_df['Number of Substantiated Complaints'])
outlier_df['Number of Citations from Infection Control Inspections'] = np.where(outlier_df['Number of Citations from Infection Control Inspections'] > Q90['Number of Citations from Infection Control Inspections'], Q50['Number of Citations from Infection Control Inspections'], outlier_df['Number of Citations from Infection Control Inspections'])
outlier_df['Percentage of long-stay residents who have depressive symptoms'] = np.where(outlier_df['Percentage of long-stay residents who have depressive symptoms'] > Q90['Percentage of long-stay residents who have depressive symptoms'], Q50['Percentage of long-stay residents who have depressive symptoms'], outlier_df['Percentage of long-stay residents who have depressive symptoms'])
outlier_df['Percentage of long-stay residents who were physically restrained'] = np.where(outlier_df['Percentage of long-stay residents who were physically restrained'] > Q90['Percentage of long-stay residents who were physically restrained'], Q50['Percentage of long-stay residents who were physically restrained'], outlier_df['Percentage of long-stay residents who were physically restrained'])
outlier_df['Confirmed COVID-19 Cases Per Occupied Beds'] = np.where(outlier_df['Confirmed COVID-19 Cases Per Occupied Beds'] > Q90['Confirmed COVID-19 Cases Per Occupied Beds'], Q50['Confirmed COVID-19 Cases Per Occupied Beds'], outlier_df['Confirmed COVID-19 Cases Per Occupied Beds'])
outlier_df['COVID-19 Deaths Per Occupied Beds'] = np.where(outlier_df['COVID-19 Deaths Per Occupied Beds'] > Q90['COVID-19 Deaths Per Occupied Beds'], Q50['COVID-19 Deaths Per Occupied Beds'], outlier_df['COVID-19 Deaths Per Occupied Beds'])
outlier_df['Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine'] = np.where(outlier_df['Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine'] < Q10['Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine'], Q50['Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine'], outlier_df['Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine'])
outlier_df['Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine'] = np.where(outlier_df['Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine'] < Q10['Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine'], Q50['Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine'], outlier_df['Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine'])
outlier_df.skew()
Number of Certified Beds 2.814405 Average Number of Residents per Day 2.784565 Total nursing staff turnover 0.356678 Registered Nurse turnover 0.149541 Adjusted Nurse Aide Staffing Hours per Resident per Day 0.709330 Adjusted LPN Staffing Hours per Resident per Day 0.584469 Adjusted RN Staffing Hours per Resident per Day 1.922657 Number of Facility Reported Incidents 1.761081 Number of Substantiated Complaints 1.596168 Number of Citations from Infection Control Inspections 1.273226 Percentage of high risk long-stay residents with pressure ulcers 0.891290 Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine -1.877671 Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine -1.249275 Percentage of long-stay residents experiencing one or more falls with major injury 1.146353 Percentage of long-stay residents who have depressive symptoms 1.671901 Percentage of long-stay residents who lose too much weight 0.974590 Percentage of long-stay residents who received an antianxiety or hypnotic medication 0.825813 Percentage of long-stay residents who received an antipsychotic medication 2.149230 Percentage of long-stay residents who were physically restrained 0.000000 Percentage of long-stay residents whose ability to move independently worsened 0.462988 Percentage of long-stay residents whose need for help with daily activities has increased 0.481214 Percentage of long-stay residents with a catheter inserted and left in their bladder 2.690169 Percentage of long-stay residents with a urinary tract infection 2.122940 Percentage of low risk long-stay residents who lose control of their bowels or bladder 0.009498 Total Number of Health Deficiencies 1.495910 Total Number of Fire Safety Deficiencies 1.749042 Confirmed COVID-19 Cases Per Occupied Beds 0.164549 COVID-19 Deaths Per Occupied Beds 0.865815 Number of outpatient emergency department visits per 1000 long-stay resident days 1.561035 Ownership Type_0 1.343020 Ownership Type_1 -0.944803 Ownership Type_2 3.523822 Long-Stay QM Rating_1.0 2.827949 Long-Stay QM Rating_2.0 1.971267 Long-Stay QM Rating_3.0 1.713311 Long-Stay QM Rating_4.0 1.336044 Long-Stay QM Rating_5.0 0.614886 dtype: float64
outlier_df.describe()
| Number of Certified Beds | Average Number of Residents per Day | Total nursing staff turnover | Registered Nurse turnover | Adjusted Nurse Aide Staffing Hours per Resident per Day | Adjusted LPN Staffing Hours per Resident per Day | Adjusted RN Staffing Hours per Resident per Day | Number of Facility Reported Incidents | Number of Substantiated Complaints | Number of Citations from Infection Control Inspections | ... | COVID-19 Deaths Per Occupied Beds | Number of outpatient emergency department visits per 1000 long-stay resident days | Ownership Type_0 | Ownership Type_1 | Ownership Type_2 | Long-Stay QM Rating_1.0 | Long-Stay QM Rating_2.0 | Long-Stay QM Rating_3.0 | Long-Stay QM Rating_4.0 | Long-Stay QM Rating_5.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 12997.000000 | 12973.000000 | 11052.000000 | 10350.000000 | 12645.000000 | 12645.000000 | 12645.000000 | 12997.000000 | 12997.000000 | 12993.000000 | ... | 12900.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 |
| mean | 113.143649 | 81.455230 | 51.273127 | 49.536705 | 2.171593 | 0.870455 | 0.675686 | 0.611834 | 2.417481 | 0.859309 | ... | 0.002229 | 0.701247 | 0.221282 | 0.713549 | 0.065169 | 0.091790 | 0.149034 | 0.174733 | 0.222282 | 0.353081 |
| std | 59.809910 | 46.666475 | 15.068248 | 21.247156 | 0.584521 | 0.339521 | 0.403360 | 1.054702 | 3.017783 | 1.072238 | ... | 0.002134 | 0.480244 | 0.415126 | 0.452120 | 0.246833 | 0.288741 | 0.356136 | 0.379753 | 0.415796 | 0.477946 |
| min | 18.000000 | 1.000000 | 6.100000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 74.000000 | 51.000000 | 40.900000 | 33.300000 | 1.788230 | 0.648650 | 0.400500 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000313 | 0.364782 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 104.000000 | 73.700000 | 50.700000 | 50.000000 | 2.110130 | 0.853320 | 0.585050 | 0.000000 | 2.000000 | 1.000000 | ... | 0.002016 | 0.595768 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 132.000000 | 99.300000 | 60.900000 | 64.000000 | 2.495520 | 1.073060 | 0.855110 | 1.000000 | 3.000000 | 1.000000 | ... | 0.003504 | 0.922189 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| max | 874.000000 | 703.300000 | 100.000000 | 100.000000 | 6.903020 | 3.636150 | 5.580270 | 4.000000 | 13.000000 | 4.000000 | ... | 0.007999 | 5.477409 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 37 columns
Upon visual analysis of the data description, it is apparently that the scale of the various attributes varies significantly. It is imperative that the data be normalized.
#Data Normalization using the min-max method
normalized_df = outlier_df.copy()
for column in normalized_df.columns:
normalized_df[column] = (normalized_df[column] - normalized_df[column].min())/(normalized_df[column].max() - normalized_df[column].min())
normalized_df.describe()
| Number of Certified Beds | Average Number of Residents per Day | Total nursing staff turnover | Registered Nurse turnover | Adjusted Nurse Aide Staffing Hours per Resident per Day | Adjusted LPN Staffing Hours per Resident per Day | Adjusted RN Staffing Hours per Resident per Day | Number of Facility Reported Incidents | Number of Substantiated Complaints | Number of Citations from Infection Control Inspections | ... | COVID-19 Deaths Per Occupied Beds | Number of outpatient emergency department visits per 1000 long-stay resident days | Ownership Type_0 | Ownership Type_1 | Ownership Type_2 | Long-Stay QM Rating_1.0 | Long-Stay QM Rating_2.0 | Long-Stay QM Rating_3.0 | Long-Stay QM Rating_4.0 | Long-Stay QM Rating_5.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 12997.000000 | 12973.000000 | 11052.000000 | 10350.000000 | 12645.000000 | 12645.000000 | 12645.000000 | 12997.000000 | 12997.000000 | 12993.000000 | ... | 12900.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 | 12997.000000 |
| mean | 0.111149 | 0.114560 | 0.481077 | 0.495367 | 0.314586 | 0.239389 | 0.121085 | 0.152958 | 0.185960 | 0.214827 | ... | 0.278669 | 0.128025 | 0.221282 | 0.713549 | 0.065169 | 0.091790 | 0.149034 | 0.174733 | 0.222282 | 0.353081 |
| std | 0.069871 | 0.066448 | 0.160471 | 0.212472 | 0.084676 | 0.093374 | 0.072283 | 0.263675 | 0.232137 | 0.268059 | ... | 0.266738 | 0.087677 | 0.415126 | 0.452120 | 0.246833 | 0.288741 | 0.356136 | 0.379753 | 0.415796 | 0.477946 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.065421 | 0.071195 | 0.370607 | 0.333000 | 0.259050 | 0.178389 | 0.071771 | 0.000000 | 0.000000 | 0.000000 | ... | 0.039065 | 0.066598 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.100467 | 0.103517 | 0.474973 | 0.500000 | 0.305682 | 0.234677 | 0.104843 | 0.000000 | 0.153846 | 0.250000 | ... | 0.252043 | 0.108768 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 0.133178 | 0.139969 | 0.583600 | 0.640000 | 0.361511 | 0.295109 | 0.153238 | 0.250000 | 0.230769 | 0.250000 | ... | 0.438009 | 0.168362 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 37 columns
Dataset is normalized to [0,1]
# Visualize distributions
normalized_df.hist(bins=30, figsize=(30,25), layout=(13,3));
As stated above, ~15 plots are approximately normally distributed, however a number of attributes are not normally distributed or have notably long right- or left-tails. The variables that are not normally distributed will be log transformed to try to address skew.
#Identify and transform non-normal columns
normalized_df[['Number of Facility Reported Incidents',
'Number of Substantiated Complaints',
'Number of Citations from Infection Control Inspections',
'Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine',
'Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine',
'Percentage of long-stay residents who have depressive symptoms',
'Percentage of long-stay residents who were physically restrained',
'Percentage of long-stay residents with a catheter inserted and left in their bladder',
'Percentage of long-stay residents with a urinary tract infection',
'Total Number of Health Deficiencies',
'Total Number of Fire Safety Deficiencies',
'Confirmed COVID-19 Cases Per Occupied Beds',
'COVID-19 Deaths Per Occupied Beds']] = clean_df[['Number of Facility Reported Incidents',
'Number of Substantiated Complaints',
'Number of Citations from Infection Control Inspections',
'Percentage of long-stay residents assessed and appropriately given the pneumococcal vaccine',
'Percentage of long-stay residents assessed and appropriately given the seasonal influenza vaccine',
'Percentage of long-stay residents who have depressive symptoms',
'Percentage of long-stay residents who were physically restrained',
'Percentage of long-stay residents with a catheter inserted and left in their bladder',
'Percentage of long-stay residents with a urinary tract infection',
'Total Number of Health Deficiencies',
'Total Number of Fire Safety Deficiencies',
'Confirmed COVID-19 Cases Per Occupied Beds',
'COVID-19 Deaths Per Occupied Beds']].applymap(lambda x: np.log(1+x))
#Check distributions of transformed data set
normalized_df.hist(bins=30, figsize=(30,25), layout=(13,3));
After transformation, distributions remain non-normal for multiple attributes.
g = sns.pairplot(normalized_df)